In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import validation_curve, learning_curve
In [2]:
path = '../Final report'
X_fn = 'x.csv'
y_fn = 'y.csv'
X_path = os.path.join(path, X_fn)
y_path = os.path.join(path, y_fn)
X = pd.read_csv(X_path)
y = pd.read_csv(y_path)
In [7]:
X.head()
Out[7]:
Express each coal fuel price as a ratio to the natural gas price, then drop the natural gas price column
In [3]:
for fuel in ['All coal', 'Lignite', 'Subbituminous']:
    X.loc[:, fuel] = X.loc[:, fuel].values / X.loc[:, 'NG Price ($/mcf)'].values
X.drop('NG Price ($/mcf)', axis=1, inplace=True)
In [4]:
cluster_ids = X['cluster'].unique()
for cluster in cluster_ids:
    X['cluster_{}'.format(cluster)] = np.eye(len(cluster_ids))[X['cluster'], cluster]
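An equivalent way to build these indicator columns is pandas' get_dummies; the cell below is only a sketch of that alternative (it assumes the cluster labels are the integer ids used above) and is not run here.
In [ ]:
# Hypothetical alternative: one-hot encode the cluster label directly with pandas.
# The default prefix_sep gives the same 'cluster_<id>' column names as the loop above.
cluster_dummies = pd.get_dummies(X['cluster'], prefix='cluster')
# X = pd.concat([X, cluster_dummies], axis=1)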
In [5]:
X.head()
Out[5]:
In [6]:
X.tail()
Out[6]:
In [46]:
free_cap_dict = {}
for cluster in range(6):
    free_cap_dict[cluster] = X.loc[X['cluster'] == cluster, ['DATETIME', 'nameplate_capacity', 'GROSS LOAD (MW)']]
    col_name = 'cluster_' + str(cluster) + ' free capacity'
    free_cap_dict[cluster].loc[:, col_name] = (free_cap_dict[cluster].loc[:, 'nameplate_capacity'].values -
                                               free_cap_dict[cluster].loc[:, 'GROSS LOAD (MW)'].values)
In [47]:
free_cap_dict[0].head()
Out[47]:
In [49]:
for cluster in range(6):
    col_name = 'cluster_' + str(cluster) + ' free capacity'
    X = pd.merge(X, free_cap_dict[cluster].loc[:, ['DATETIME', col_name]], on='DATETIME')
In [51]:
X.head(n=10)
Out[51]:
In [ ]:
for idx in X.index:
    datetime = X.loc[idx, 'DATETIME']
    for cluster in range(6):
        col_name = 'cluster_' + str(cluster) + ' free capacity'
        X.loc[idx, col_name] =
In [8]:
y.tail()
Out[8]:
In [7]:
X_cols = ['nameplate_capacity', 'GROSS LOAD (MW)', 'ERCOT Load, MW',
'Total Wind Installed, MW', 'Total Wind Output, MW', 'Net Load Change (MW)',
'All coal', 'Lignite', 'Subbituminous']
X_cluster_cols = ['cluster_{}'.format(cluster) for cluster in cluster_ids]
# X_cluster_free_cols = ['cluster_{} free capacity'.format(cluster) for cluster in cluster_ids]
X_clean = X.loc[:,X_cols+X_cluster_cols]#+X_cluster_free_cols]
X_clean.fillna(0, inplace=True)
y_clean = y.loc[:,'Gen Change (MW)']
y_clean.fillna(0, inplace=True)
In [8]:
print X_clean.shape
print y_clean.shape
In [9]:
X_clean.head()
Out[9]:
In [10]:
X_train = X_clean.loc[(X['Year']<2012),:]
y_train = y_clean.loc[(X['Year']<2012)]
X_va = X_clean.loc[X['Year'].isin([2012, 2013]),:]
y_va = y_clean.loc[X['Year'].isin([2012, 2013])]
X_test = X_clean.loc[X['Year']>2013,:]
y_test = y_clean.loc[X['Year']>2013]
Some of the models need scaled versions of the X data. Fit the scaler on the training years only and apply the same transform to the validation and test sets, so their statistics do not leak into training.
In [11]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_va_scaled = scaler.transform(X_va)
X_test_scaled = scaler.transform(X_test)
Check size of all arrays
In [21]:
print X_train_scaled.shape, y_train.shape
print X_va_scaled.shape, y_va.shape
print X_test_scaled.shape, y_test.shape
In [59]:
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
Out[59]:
In [60]:
lm.score(X_va_scaled, y_va)
Out[60]:
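The score reported above is the R^2 on the 2012-2013 validation years. As a quick supplementary check, a minimal sketch of the corresponding RMSE in MW, using sklearn's mean_squared_error:
In [ ]:
# RMSE of the linear baseline on the validation set, in MW.
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_va, lm.predict(X_va_scaled)))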
In [24]:
y_pr = lm.predict(X_va_scaled)
In [41]:
y_va.values.shape, y_pr.shape, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values.shape
Out[41]:
In [25]:
y_lm_resids = pd.DataFrame(dict(zip(['Gen Change (MW)', 'y_pr', 'cluster'],
[y_va.values, y_pr, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values])))
# y_lm_resids['y_pr'] = y_pr
# y_lm_resids['cluster'] = X.loc[:,'cluster']
In [26]:
y_lm_resids.head()
Out[26]:
In [27]:
y_lm_resids.loc[:,'residuals'] = y_lm_resids.loc[:,'y_pr'] - y_lm_resids.loc[:,'Gen Change (MW)']
In [29]:
g = sns.FacetGrid(y_lm_resids, hue='cluster', col='cluster',
col_wrap=3)
g.map(plt.scatter, 'Gen Change (MW)', 'residuals')
g.add_legend()
Out[29]:
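For later comparison with the boosted models, a quick numeric summary of the linear-model residuals (a minimal sketch mirroring the describe() call used further down):
In [ ]:
# Distribution of the linear-model residuals on the validation years.
y_lm_resids.loc[:, 'residuals'].describe()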
In [12]:
from xgboost import XGBRegressor
Validation curve for n_estimators
In [13]:
param_values = [25, 100, 250, 350]
train_scores, valid_scores = validation_curve(XGBRegressor(), X_train, y_train, "n_estimators", param_values,
n_jobs=-1, verbose=3)
In [14]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
In [15]:
plt.title("Validation Curve with XGBoost", size=15)
plt.xlabel("n_estimators", size=15)
plt.ylabel("Score", size=15)
plt.ylim(0.0, 1.1)
lw = 2
plt.plot(param_values, train_scores_mean, label="Training score",
color="darkorange", lw=lw)
plt.fill_between(param_values, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2,
color="darkorange", lw=lw)
plt.plot(param_values, valid_scores_mean, label="Cross-validation score",
color="navy", lw=lw)
plt.fill_between(param_values, valid_scores_mean - valid_scores_std,
valid_scores_mean + valid_scores_std, alpha=0.2,
color="navy", lw=lw)
plt.legend(loc="best")
plt.savefig('XGBoost n_estimators validation curve.pdf', bbox_inches='tight')
Out[15]:
Validation curve for max_depth
In [16]:
param_values = [1,3,5,9,15]
train_scores, valid_scores = validation_curve(XGBRegressor(n_estimators=250), X_train, y_train, "max_depth", param_values,
n_jobs=-1, verbose=3)
In [17]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
In [18]:
plt.title("Validation Curve with XGBoost", size=15)
plt.xlabel("max_depth", size=15)
plt.ylabel("Score", size=15)
plt.ylim(0.0, 1.1)
lw = 2
plt.plot(param_values, train_scores_mean, label="Training score",
color="darkorange", lw=lw)
plt.fill_between(param_values, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2,
color="darkorange", lw=lw)
plt.plot(param_values, valid_scores_mean, label="Cross-validation score",
color="navy", lw=lw)
plt.fill_between(param_values, valid_scores_mean - valid_scores_std,
valid_scores_mean + valid_scores_std, alpha=0.2,
color="navy", lw=lw)
plt.legend(loc="best")
plt.savefig('XGBoost max_depth validation curve.pdf', bbox_inches='tight')
Out[18]:
Validation curve for reg_alpha
In [68]:
param_values = np.logspace(-5, 1, 7)
train_scores, valid_scores = validation_curve(XGBRegressor(n_estimators=250), X_train, y_train, "reg_alpha", param_values,
n_jobs=-1, verbose=3)
In [69]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
In [71]:
plt.title("Validation Curve with XGBoost")
plt.xlabel("reg_alpha")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_values, train_scores_mean, label="Training score",
color="darkorange", lw=lw)
plt.fill_between(param_values, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2,
color="darkorange", lw=lw)
plt.semilogx(param_values, valid_scores_mean, label="Cross-validation score",
color="navy", lw=lw)
plt.fill_between(param_values, valid_scores_mean - valid_scores_std,
valid_scores_mean + valid_scores_std, alpha=0.2,
color="navy", lw=lw)
plt.legend(loc="best")
Out[71]:
Learning curve for n_estimators=250 and max_depth=3
In [37]:
train_sizes, train_scores, valid_scores = learning_curve(XGBRegressor(n_estimators=250), X_train, y_train,
n_jobs=-1, verbose=3)
In [38]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
In [39]:
plt.title("Learning Curve with XGBoost", size=15)
plt.xlabel("Sample size", size=15)
plt.ylabel("Score", size=15)
plt.ylim(0.0, 1.1)
lw = 2
plt.plot(train_sizes, train_scores_mean, label="Training score",
color="darkorange", lw=lw)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2,
color="darkorange", lw=lw)
plt.plot(train_sizes, valid_scores_mean, label="Cross-validation score",
color="navy", lw=lw)
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
valid_scores_mean + valid_scores_std, alpha=0.2,
color="navy", lw=lw)
plt.legend(loc="best")
plt.savefig('XGBoost learning curve.pdf', bbox_inches='tight')
Out[39]:
In [22]:
xgbr = XGBRegressor(n_estimators=250)
In [23]:
xgbr.fit(X_train, y_train)
Out[23]:
In [25]:
y_pr = xgbr.predict(X_va)
y_xgbr_resids = pd.DataFrame(dict(zip(['Gen Change (MW)', 'y_pr', 'cluster'],
[y_va.values, y_pr, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values])))
In [27]:
y_xgbr_resids.loc[:,'residuals'] = y_xgbr_resids.loc[:,'y_pr'] - y_xgbr_resids.loc[:,'Gen Change (MW)']
In [39]:
with sns.axes_style('whitegrid'):
    g = sns.FacetGrid(y_xgbr_resids, hue='cluster', col='cluster',
                      col_wrap=3)
    g.map(plt.scatter, 'y_pr', 'residuals', s=5, alpha=.3)
    g.set_xlabels(size=15)
    g.set_ylabels(size=15)
    plt.savefig('XGBR residuals.pdf')
Out[39]:
In [15]:
model = XGBRegressor()
In [16]:
subsample = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(subsample=subsample)
In [18]:
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, verbose=3)
In [19]:
result = grid_search.fit(X_train_scaled, y_train)
In [20]:
result.cv_results_
Out[20]:
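cv_results_ is a fairly verbose dict; a hedged sketch of a more compact summary, assuming the single-parameter subsample grid above:
In [ ]:
# One row per subsample value, with the mean and std of the CV test score.
pd.DataFrame(result.cv_results_)[['param_subsample', 'mean_test_score', 'std_test_score']]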
In [ ]:
In [22]:
model = XGBRegressor()
In [23]:
colsample_bytree = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(colsample_bytree=colsample_bytree)
In [24]:
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, verbose=3)
In [25]:
result = grid_search.fit(X_train_scaled, y_train)
In [26]:
result.cv_results_
Out[26]:
In [ ]:
In [27]:
model = XGBRegressor()
In [28]:
colsample_bylevel = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
param_grid = dict(colsample_bylevel=colsample_bylevel)
In [29]:
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, verbose=3)
In [30]:
result = grid_search.fit(X_train_scaled, y_train)
In [31]:
result.cv_results_
Out[31]:
In [ ]:
In [32]:
model = XGBRegressor()
In [33]:
max_depth = [3, 6, 9]
n_estimators = [100, 250, 500]
reg_alpha = [1e-5, 1e-3, 0.1]
reg_lambda = [1e-3, 0.1, 1]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators,
reg_alpha=reg_alpha, reg_lambda=reg_lambda)
In [35]:
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, verbose=2)
In [36]:
result = grid_search.fit(X_train_scaled, y_train)
In [37]:
import cPickle as pickle
In [38]:
pickle.dump((grid_search, result), open( "xgb gridsearch and results.pkl", "wb" ) )
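To reload the pickled search in a later session, a minimal sketch (assuming the same filename and the cPickle import above):
In [ ]:
# Hypothetical reload of the saved grid search and its fit result.
with open("xgb gridsearch and results.pkl", "rb") as f:
    grid_search, result = pickle.load(f)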
In [39]:
result.cv_results_
Out[39]:
In [41]:
result.best_estimator_
Out[41]:
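The winning parameter combination and its mean cross-validation score can also be read directly from the standard GridSearchCV attributes (sketch):
In [ ]:
# Best hyperparameters and mean CV score from the grid search.
print result.best_params_
print result.best_score_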
In [40]:
grid_search.score(X_va_scaled, y_va)
Out[40]:
In [42]:
xgb = XGBRegressor(n_estimators=250, reg_alpha=0.1)
In [43]:
xgb.fit(X_train, y_train)
Out[43]:
In [44]:
xgb.score(X_va, y_va)
Out[44]:
In [46]:
y_pr = xgb.predict(X_va)
In [47]:
y_xgb_resids = pd.DataFrame(dict(zip(['Gen Change (MW)', 'y_pr', 'cluster'],
[y_va.values, y_pr, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values])))
In [50]:
y_xgb_resids.loc[:,'residuals'] = y_xgb_resids.loc[:,'y_pr'] - y_xgb_resids.loc[:,'Gen Change (MW)']
In [51]:
g = sns.FacetGrid(y_xgb_resids, hue='cluster', col='cluster',
col_wrap=3)
g.map(plt.scatter, 'y_pr', 'residuals')
g.add_legend()
Out[51]:
In [52]:
y_xgb_resids.describe()
Out[52]:
In [54]:
X_train.columns
Out[54]:
In [66]:
X_train_ratio = X_train.copy()
X_va_ratio = X_va.copy()
for fuel in ['All coal', 'Lignite', 'Subbituminous']:
    X_train_ratio.loc[:, fuel] = X_train_ratio.loc[:, fuel].values / X_train_ratio.loc[:, 'NG Price ($/mcf)'].values
    X_va_ratio.loc[:, fuel] = X_va_ratio.loc[:, fuel].values / X_va_ratio.loc[:, 'NG Price ($/mcf)'].values
X_train_ratio.drop('NG Price ($/mcf)', axis=1, inplace=True)
X_va_ratio.drop('NG Price ($/mcf)', axis=1, inplace=True)
In [67]:
X_train.head()
Out[67]:
In [68]:
X_train_ratio.head()
Out[68]:
In [71]:
xgb_ratio = XGBRegressor(n_estimators=250, reg_alpha=0.1)
In [72]:
xgb_ratio.fit(X_train_ratio, y_train)
Out[72]:
In [73]:
xgb_ratio.score(X_va_ratio, y_va)
Out[73]:
In [75]:
y_pr = xgb_ratio.predict(X_va_ratio)
In [76]:
y_xgb_resids = pd.DataFrame(dict(zip(['Gen Change (MW)', 'y_pr', 'cluster'],
[y_va.values, y_pr, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values])))
In [77]:
y_xgb_resids.loc[:,'residuals'] = y_xgb_resids.loc[:,'y_pr'] - y_xgb_resids.loc[:,'Gen Change (MW)']
In [78]:
g = sns.FacetGrid(y_xgb_resids, hue='cluster', col='cluster',
col_wrap=3)
g.map(plt.scatter, 'y_pr', 'residuals')
g.add_legend()
Out[78]:
In [79]:
y_xgb_resids.describe()
Out[79]:
In [81]:
from xgboost import plot_importance
In [82]:
plot_importance(xgb_ratio)
Out[82]:
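The same information is available numerically through the fitted model's feature_importances_ array; a short sketch that pairs it with the column names (assumes the training column order is preserved):
In [ ]:
# Feature importances as a labeled, sorted Series.
pd.Series(xgb_ratio.feature_importances_, index=X_train_ratio.columns).sort_values(ascending=False)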
In [ ]:
In [83]:
lm = LinearRegression(normalize=True)
lm.fit(X_train_ratio, y_train)
Out[83]:
In [84]:
lm.score(X_va_ratio, y_va)
Out[84]:
In [ ]: